This script for for analyzing heterozygosity of the Stercorariidae samples using data generated by Rohan.
First, I will load some functions that I wrote for handling the output of Rohan.
# This function minimally processes the output of Rohan.
heterozygosity.prep <- function(sample, NumChroms, SpecialChrom){ #requires an input file
hEst <- read.delim(paste(sample,".rohan.hEst.gz", sep=""))
chromosomes.to.plot <- unique(hEst$X.CHROM)
chromosomes.to.plot <- chromosomes.to.plot[1:NumChroms]
hEst <- hEst %>%
filter(X.CHROM %in% chromosomes.to.plot)
hEst$zygosity <- "Black"
for (i in 1:nrow(hEst)){
if (hEst$X.CHROM[i]==SpecialChrom){
hEst$zygosity[i]<- "#ff4f00"
}
hEst$pop<-substr(sample, 1, 4) #extract first 4 letters of sample name as the population identifier
}
return(hEst)
}
# This function produces a scatter plot
heterozygosity.scatter <- function(hEst){ #requires an input file
if ((mean(hEst$h)-(2*sd(hEst$h)))>0){
ylim <- (mean(hEst$h)-(2*sd(hEst$h)))
} else {
ylim <- 0
}
scatterplot <- ggplot(data = hEst)+ #create a scatterplot with ggplot
geom_hline(yintercept = (mean(hEst$h)+(2*sd(hEst$h))))+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
geom_hline(yintercept = ylim)+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
geom_point(shape = 20, alpha=1, aes(x = BEGIN/1000000, y = h), color = hEst$zygosity) + #plot very small points which are translucent to avoid overplotting because there are so many data points
theme_classic()+ #simplify plot
facet_grid(~ X.CHROM, scales = "free", space="free", drop=T, shrink=T)+ #plot each chromosome separately
theme(strip.background = element_blank(), strip.text.x = element_blank(), axis.text.x=element_blank())+ #simplify plot, remove title for presentation purposes as it is a distraction and redundant with a caption
xlab('Chromosomes')+ #add x label
coord_cartesian(clip = "off") + #stop the plot from clipping the edges of points in very small chromosomes
ylab('Heterozygosity')+ #label y axis
#ylim(0, 0.01)
scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) + #make sure that the origin is at 0,0
scale_y_continuous(expand = c(0, 0), limits = c(0, NA))
#geom_point(data=archaicfile[archaicfile$Coalescence=="Ancient",], aes(x = start/1000000, y = SNP_density, color=Coalescence), pch=20, size=2) #plot the points in red in a larger size which are above the posterior probability threshold for being "ancient"
return(scatterplot)
}
#This function produces a scatter plot of multiple samples
combination.scatter <- function(hEst, poplabel_list, colour_list){ #requires an input file
if ((mean(hEst$h)-(2*sd(hEst$h)))>0){
ylim <- (mean(hEst$h)-(2*sd(hEst$h)))
} else {
ylim <- 0
}
scatterplot <- ggplot(data = hEst)+ #create a scatterplot with ggplot
#geom_hline(yintercept = (mean(hEst$h)+(2*sd(hEst$h))))+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
#geom_hline(yintercept = ylim)+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
#geom_point(shape = 20, size=2, alpha=1, aes(x = BEGIN/1000000, y = h, color=sample)) + #plot very small points which are translucent to avoid overplotting because there are so many data points
geom_line(aes(x = BEGIN/1000000, y = h, group=sample, color=pop))+
#scale_color_discrete(name="Sample", labels=label_list)+
scale_color_discrete(name="Population", labels=poplabel_list, type=colour_list)+
theme_classic()+ #simplify plot
#theme(axis.text.y = element_text(family="Times", face="plain", colour="black", size=10), axis.title.x = element_text(family="Times", face="plain", colour="black", size=14), axis.title.y = element_text(family="Times", face="plain", colour="black", size=14), legend.text = element_text(family="Times", face="italic", colour="black", size=10), legend.title = element_text(family="Times", face="plain", colour="black", size=12))+ #define the text fonts and sizes for presentation purposes
facet_grid(~ X.CHROM, scales = "free", space="free", drop=T, shrink=T)+ #plot each chromosome separately
theme(strip.background = element_blank(), strip.text.x = element_blank(), axis.text.x=element_blank(), axis.line.y.right = element_line())+ #simplify plot, remove title for presentation purposes as it is a distraction and redundant with a caption
xlab('Genomic Position')+ #add x label
coord_cartesian(clip = "off") + #stop the plot from clipping the edges of points in very small chromosomes
ylab('Heterozygosity')+ #label y axis
#ylim(0, 0.00)+
scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) + #make sure that the origin is at 0,0
scale_y_continuous(expand = c(0, 0), limits = c(0, NA))
#geom_point(data=archaicfile[archaicfile$Coalescence=="Ancient",], aes(x = start/1000000, y = SNP_density, color=Coalescence), pch=20, size=2) #plot the points in red in a larger size which are above the posterior probability threshold for being "ancient"
return(scatterplot)
}
# This function creates a line graph for 1 chromosome
combination.line.single <- function(hEst, label_list, colour_list){ #requires an input file and list of names for the legend
if ((mean(hEst$h)-(2*sd(hEst$h)))>0){ #This calculates 2 standard deviations above the mean so you can plot that line if you want
ylim <- (mean(hEst$h)-(2*sd(hEst$h)))
} else {
ylim <- 0
}
scatterplot <- ggplot(data = hEst)+ #create a scatterplot with ggplot
#geom_hline(yintercept = (mean(hEst$h)+(2*sd(hEst$h))))+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
#geom_hline(yintercept = ylim)+ #add a horizonatal line to mark the SNP density which is 2 standard deviations above the mean
#geom_point(shape = 20, size=2, alpha=1, aes(x = BEGIN/1000000, y = h, color=sample)) + #plot very small points which are translucent to avoid overplotting because there are so many data points
#geom_line(aes(x = BEGIN/1000000, y = h, group=sample, color=sample))+ #plot as a line instead of points, I like this better
#scale_color_discrete(name="Sample", labels=label_list)+
geom_line(aes(x = BEGIN/1000000, y = h, group=sample, color=pop))+ #plot as a line instead of points, I like this better
scale_color_discrete(name="Population", labels=label_list, type=colour_list)+
theme_classic()+ #simplify plot
#theme(axis.text.y = element_text(family="Times", face="plain", colour="black", size=10), axis.title.x = element_text(family="Times", face="plain", colour="black", size=14), axis.title.y = element_text(family="Times", face="plain", colour="black", size=14), legend.text = element_text(family="Times", face="italic", colour="black", size=10), legend.title = element_text(family="Times", face="plain", colour="black", size=12))+ #define the text fonts and sizes for presentation purposes
facet_grid(~ X.CHROM, scales = "free", space="free", drop=T, shrink=T)+ #plot each chromosome separately
theme(strip.background = element_blank(), strip.text.x = element_blank())+ #simplify plot, remove title for presentation purposes as it is a distraction and redundant with a caption
xlab('Position (Mb)')+ #add x label
coord_cartesian(clip = "off") + #stop the plot from clipping the edges of points in very small chromosomes
ylab('Heterozygosity')+ #label y axis
#ylim(0, 0.01)
scale_x_continuous(expand = c(0, 0), limits = c(0, NA)) + #make sure that the origin is at 0,0
scale_y_continuous(expand = c(0, 0), limits = c(0, NA))
#geom_point(data=archaicfile[archaicfile$Coalescence=="Ancient",], aes(x = start/1000000, y = SNP_density, color=Coalescence), pch=20, size=2) #plot the points in red in a larger size which are above the posterior probability threshold for being "ancient"
return(scatterplot)
}
We will read in the data from the rohan.hEst.gz files. The file output by Rohan has this structure:
# column 1: chromosome
# column 2: window start
# column 3: window end
# column 4: number of good sites
# column 5: heterozygosity
# pick a sample or samples to analyze
#sample <- "POJA_MKP1559"
#sample <- "LTJA_MKP990"
#sample <- "CISK_32"
#sample <- "CISK_3"
#sample <- "GRSK_MKP1592"
#sample <- "ANSK7"
#sample <- "PAJA_B20730"
#samples <- c("Alca_torda", "Fratercula_arctica")
#we will analyze all samples together
samples <- c("PAJA_B20730", "PAJA_USNM606730", "LTJA_MKP990", "POJA_4", "POJA_IB2659", "POJA_MKP1559", "GRSK_MKP1592", "GRSK_MKP1593", "CISK2", "CISK3", "CISK55", "CHSK_MKP2451", "ANSK01", "ANSK7", "ANSK8")
#Define how many chromosomes to analyze (any past this number will be ignored, I have a bunch of really tiny scaffolds at the end of the files)
NumChroms <- 26 #the first 25 are autosomes, 26th is the Z
#Define any particular chromosome to plot separately or highlight. it will get coloured gold. Set to NA if none are special.
SpecialChrom <- "chrZ" #Z
# read in the data and combine samples
combination <- heterozygosity.prep(paste0(samples[1], ".c"), NumChroms, SpecialChrom) #start a new dataframe with the first sample
combination$sample <- samples[1] #add a column that states the sample name
for (i in 2:length(samples)){ # go through each sample and add it to the data
print(samples[i])
temp <- heterozygosity.prep(paste0(samples[i], ".c"), NumChroms, SpecialChrom)
temp$sample <- samples[i]
combination <- rbind(temp, combination)
}
## [1] "PAJA_USNM606730"
## [1] "LTJA_MKP990"
## [1] "POJA_4"
## [1] "POJA_IB2659"
## [1] "POJA_MKP1559"
## [1] "GRSK_MKP1592"
## [1] "GRSK_MKP1593"
## [1] "CISK2"
## [1] "CISK3"
## [1] "CISK55"
## [1] "CHSK_MKP2451"
## [1] "ANSK01"
## [1] "ANSK7"
## [1] "ANSK8"
Our first analysis will focus on chrZ. We expect that female samples will show very low heterozygosity across the chromosome, with a spike in the regions flanking the pseudoautosomal region, where paralogous regions of chrW still map.
#define which chromosome we want to look at.
#Define any particular chromosome to plot separately or highlight
SpecialChrom <- "chrZ" #Z
#look specifically at the chromosome of interest
#filter the dataset to include only the chromosome of interest
combination.chr <- combination %>%
filter(X.CHROM==SpecialChrom)
#This list holds the names of the samples that I want in the figure, since my sample codes would be less informative for the reader than the species name
label_list<- c("S. maccormicki", "S. chilensis", "S. antarcticus", "S. skua", "S. longicaudus", "S. parasiticus", "S. pomarinus")
#Note you need to know the order the samples will be plotted in! It is alphabetical by sample name, NOT the order you specified in your vector. eg., S. maccormicki goes before S. chilensis because the samples within S. maccormicki are called "ANSK".
#define the colours that you want to be used for each group
colour_list <- c("#0076b7", "#f29b02", "#2cb7e9", "#d973a9", "#e55501", "#f4e304", "#05a16f")
#Plot the single chromosome
combination.line.single(combination.chr, label_list, colour_list)
### This is a figure I will put in the paper for heterozygosity across the z chromosome
#save the Figure.
#ggsave("FigureS1_chrZ_heterozygosity.pdf", width=3456, height=2234, units="px")
Now we can take a broader look and scan across the whole genome.
#since we are scanning across all the chromosomes, define the order that the chromosomes should be plotted in - we don't want alphabetical, or else chr1 and chr2 will be separated by chr10-19.
combination$X.CHROM <- factor(combination$X.CHROM, levels = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chr23", "chr24", "chr25", "chrZ")) # Reordering group factor levels
#first, we could look at one sample of interest at a time
# create only one sample scatterplot, plot across whole genome and highlight the special chromosome
#heterozygosity.scatter(heterozygosity.prep(paste0(sample, ".a"), NumChroms, SpecialChrom))
#or we could loop through our whole list of samples
# create one scatterplot for each of a list of samples
for (i in 1:length(samples)){
print(samples[i])
temp <- heterozygosity.prep(paste0(samples[i], ".c"), NumChroms, SpecialChrom)
#quartz()
print(heterozygosity.scatter(temp))
}
## [1] "PAJA_B20730"
## [1] "PAJA_USNM606730"
## [1] "LTJA_MKP990"
## [1] "POJA_4"
## [1] "POJA_IB2659"
## [1] "POJA_MKP1559"
## [1] "GRSK_MKP1592"
## [1] "GRSK_MKP1593"
## [1] "CISK2"
## [1] "CISK3"
## [1] "CISK55"
## [1] "CHSK_MKP2451"
## [1] "ANSK01"
## [1] "ANSK7"
## [1] "ANSK8"
#define the colours that you want to be used for each group
colour_list <- c("#0076b7", "#f29b02", "#2cb7e9", "#d973a9", "#e55501", "#f4e304", "#05a16f")
#This plot will show all samples together.
combination.scatter(combination, label_list, colour_list)
#save the figure
#ggsave("FigureS2_genomescan_heterozygosity.pdf", width=5000, height=2234, units="px")
#Alternatively, Loop through every chromosome if you want to examine each one in detail
for (i in 1:NumChroms){
combination.chr <- combination %>%
filter(X.CHROM==levels(combination$X.CHROM)[i])
#quartz()
print(levels(combination$X.CHROM)[i])
print(combination.line.single(combination.chr, label_list, colour_list))
}
## [1] "chr1"
## [1] "chr2"
## [1] "chr3"
## [1] "chr4"
## [1] "chr5"
## [1] "chr6"
## [1] "chr7"
## [1] "chr8"
## [1] "chr9"
## [1] "chr10"
## [1] "chr11"
## [1] "chr12"
## [1] "chr13"
## [1] "chr14"
## [1] "chr15"
## [1] "chr16"
## [1] "chr17"
## [1] "chr18"
## [1] "chr19"
## [1] "chr20"
## [1] "chr21"
## [1] "chr22"
## [1] "chr23"
## [1] "chr24"
## [1] "chr25"
## [1] "chrZ"
Next, we can calculate average heterozygosity of the autosomes (i.e., excluding chrZ)
##########
#calculate average heterozygosity
#Define how many chromosomes to analyze (any past this number will be ignored, I have a bunch of really tiny scaffolds at the end of the files)
NumChroms <- 25 #use the 25 chromosomes, do not use #26 which is the Z
# pick a sample or samples to analyze
samples <- c("PAJA_B20730", "PAJA_USNM606730", "LTJA_MKP990", "POJA_4", "POJA_IB2659", "POJA_MKP1559", "GRSK_MKP1592", "GRSK_MKP1593", "CISK2", "CISK3", "CISK55", "CHSK_MKP2451", "ANSK01", "ANSK7", "ANSK8", "Alca_torda")
#load the data and calculate heterozygosity for each one
heterozygosities <- NULL
for (i in 1:length(samples)){
print(samples[i])
temp <- heterozygosity.prep(paste0(samples[i], ".c"), NumChroms, SpecialChrom)
heterozygosities[i] <- mean(temp$h)*100
print(heterozygosities[i])
}
## [1] "PAJA_B20730"
## [1] 0.2616347
## [1] "PAJA_USNM606730"
## [1] 0.2074129
## [1] "LTJA_MKP990"
## [1] 0.709196
## [1] "POJA_4"
## [1] 0.3051554
## [1] "POJA_IB2659"
## [1] 0.3130811
## [1] "POJA_MKP1559"
## [1] 0.4249971
## [1] "GRSK_MKP1592"
## [1] 0.1324102
## [1] "GRSK_MKP1593"
## [1] 0.1215987
## [1] "CISK2"
## [1] 0.106758
## [1] "CISK3"
## [1] 0.1054413
## [1] "CISK55"
## [1] 0.1120469
## [1] "CHSK_MKP2451"
## [1] 0.146525
## [1] "ANSK01"
## [1] 0.1340762
## [1] "ANSK7"
## [1] 0.121449
## [1] "ANSK8"
## [1] 0.1302776
## [1] "Alca_torda"
## [1] 0.3560451
heterozygosities #numbers are in the same order as the list of samples were in
## [1] 0.2616347 0.2074129 0.7091960 0.3051554 0.3130811 0.4249971 0.1324102
## [8] 0.1215987 0.1067580 0.1054413 0.1120469 0.1465250 0.1340762 0.1214490
## [15] 0.1302776 0.3560451